{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from os import listdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = listdir(\"Claudia_output\")\n",
    "inputs = []\n",
    "for i in range(len(files)):\n",
    "    if files[i].endswith(\"txt.csv\") == True:\n",
    "        inputs.append(files[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = []\n",
    "for i in range(len(files)):\n",
    "    name = inputs[i].replace('CpG_context_','')\n",
    "    name = name.replace('_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv','')\n",
    "    names.append(name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def summary_file(file_name,name):\n",
    "    df = pd.read_csv(file_name,sep=',',skiprows=1,header=None)\n",
    "    df.columns = [\"kmer\",\"Met\",\"Unmet\",\"Total\",\"Perc_Met\"]\n",
    "    kmer_temp = df.loc[:,[\"kmer\",\"Total\",\"Perc_Met\"]]\n",
    "    met_sum = df.loc[:,\"Met\"].sum()\n",
    "    tot_sum = df.loc[:,\"Total\"].sum()\n",
    "    per_met = met_sum*100/tot_sum\n",
    "    kmer_temp.loc[\"Sum\"] = [\"Total\", tot_sum, per_met]\n",
    "    kmer_temp[\"Perc_Nor\"] = kmer_temp[\"Perc_Met\"] - per_met\n",
    "    kmer_temp.columns = [\"kmer\",str(name)+\"_Total\",str(name)+\"_Perc_met\",str(name)+\"Perc_Norm\"]\n",
    "    return(kmer_temp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_input='Claudia_output'\n",
    "\n",
    "output_file = summary_file(str(folder_input)+\"/\"+str(inputs[0]),names[0])\n",
    "for i in range(1,len(inputs)):\n",
    "    temp_output_file = summary_file(str(folder_input)+\"/\"+str(inputs[i]),names[i])\n",
    "    output_file = output_file.merge(temp_output_file, left_on='kmer', right_on='kmer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "#output_file.to_csv(\"Claudia_summary_results.csv\",index=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(257, 562)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "output_file.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_file_t = output_file.T\n",
    "output_file_t.columns = output_file_t.iloc[0]\n",
    "output_file_t = output_file_t.drop(output_file_t.index[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_file_total = output_file_t.iloc[list(range(0,len(output_file_t),3)),:]\n",
    "output_file_total.to_csv(\"Claudia_summary_total.csv\")\n",
    "output_file_meth = output_file_t.iloc[list(range(1,len(output_file_t),3)),:]\n",
    "output_file_meth.to_csv(\"Claudia_summary_meth.csv\") \n",
    "output_file_norm = output_file_t.iloc[list(range(2,len(output_file_t),3)),:]\n",
    "output_file_norm.to_csv(\"Claudia_summary_norm.csv\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_file_total = output_file_t.iloc[list(range(0,len(output_file_t),3)),:]\n",
    "output_file_total.T.to_csv(\"Motif_calls_TET3_84h_2June2020.csv\")\n",
    "output_file_meth = output_file_t.iloc[list(range(1,len(output_file_t),3)),:]\n",
    "output_file_meth.T.to_csv(\"Motif_meth_TET3_84h_2June2020.csv\") \n",
    "output_file_norm = output_file_t.iloc[list(range(2,len(output_file_t),3)),:]\n",
    "output_file_norm.T.to_csv(\"Motif_norm_TET3_84h_2June2020.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Rosie files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = listdir(\"Rosie_output\")\n",
    "inputs = []\n",
    "for i in range(len(files)):\n",
    "    if files[i].endswith(\"txt.csv\") == True:\n",
    "        inputs.append(files[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "#inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = []\n",
    "for i in range(len(files)):\n",
    "    name = inputs[i].replace('CpG_context_','')\n",
    "    name = name.replace('_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv','')\n",
    "    names.append(name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_input='Rosie_output'\n",
    "\n",
    "output_file = summary_file(str(folder_input)+\"/\"+str(inputs[0]),names[0])\n",
    "for i in range(1,len(inputs)):\n",
    "    temp_output_file = summary_file(str(folder_input)+\"/\"+str(inputs[i]),names[i])\n",
    "    output_file = output_file.merge(temp_output_file, left_on='kmer', right_on='kmer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "#output_file.to_csv(\"Rosie_summary_results.csv\",index=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(257, 199)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "output_file.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_file_t = output_file.T\n",
    "output_file_t.columns = output_file_t.iloc[0]\n",
    "output_file_t = output_file_t.drop(output_file_t.index[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_file_total = output_file_t.iloc[list(range(0,len(output_file_t),3)),:]\n",
    "#output_file_total.to_csv(\"Rosie_summary_total.csv\")\n",
    "output_file_meth = output_file_t.iloc[list(range(1,len(output_file_t),3)),:]\n",
    "#output_file_meth.to_csv(\"Rosie_summary_meth.csv\") \n",
    "output_file_norm = output_file_t.iloc[list(range(2,len(output_file_t),3)),:]\n",
    "#output_file_norm.to_csv(\"Rosie_summary_norm.csv\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_file_total = output_file_t.iloc[list(range(0,len(output_file_t),3)),:]\n",
    "output_file_total.T.to_csv(\"Motif_calls_Decitabine_84_2June2020.csv\")\n",
    "output_file_meth = output_file_t.iloc[list(range(1,len(output_file_t),3)),:]\n",
    "output_file_meth.T.to_csv(\"Motif_meth_Decitabine_2June2020.csv\") \n",
    "output_file_norm = output_file_t.iloc[list(range(2,len(output_file_t),3)),:]\n",
    "output_file_norm.T.to_csv(\"Motif_norm_Decitabine_2June2020_.csv\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tim old files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = listdir(\"TET3_data\")\n",
    "inputs = []\n",
    "for i in range(len(files)):\n",
    "    if files[i].endswith(\".csv\") == True:\n",
    "        inputs.append(files[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = []\n",
    "for i in range(len(inputs)):\n",
    "    name = inputs[i]\n",
    "    name = name.replace('.csv','')\n",
    "    names.append(name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_input='TET3_data'\n",
    "\n",
    "output_file = summary_file(str(folder_input)+\"/\"+str(inputs[0]),names[0])\n",
    "for i in range(1,len(inputs)):\n",
    "    temp_output_file = summary_file(str(folder_input)+\"/\"+str(inputs[i]),names[i])\n",
    "    output_file = output_file.merge(temp_output_file, left_on='kmer', right_on='kmer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(257, 193)"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "output_file.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_file_t = output_file.T\n",
    "output_file_t.columns = output_file_t.iloc[0]\n",
    "output_file_t = output_file_t.drop(output_file_t.index[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_file_total = output_file_t.iloc[list(range(0,len(output_file_t),3)),:]\n",
    "output_file_total.T.to_csv(\"Motif_calls_TET3_72h_2June2020.csv\")\n",
    "output_file_meth = output_file_t.iloc[list(range(1,len(output_file_t),3)),:]\n",
    "output_file_meth.T.to_csv(\"Motif_meth_TET3_72h_2June2020.csv\") \n",
    "output_file_norm = output_file_t.iloc[list(range(2,len(output_file_t),3)),:]\n",
    "output_file_norm.T.to_csv(\"Motif_norm_TET3_72h_2June2020.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
